The purpose of exploratory analysis is to perform initial investigations on the data so as to discover patterns, identify any anomalies, to test initial hypothesis and check assumptions made at the start of the project.
In this section of the project we will ask two types of questions:
Variation being the tendency for values of a variable to change from measurement to measurement, and covariance is the tendency for two or more variables to vary together in a related way.
pacman::p_load(
tidyverse,
here,
RColorBrewer,
lubridate,
scales,
GGally,
stats,
corrplot,
mice,
VIM
)
flights_dt <- read_csv(here("clean_data/flights_clean.csv"))
Rows: 327346 Columns: 38
── Column specification ─────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (11): origin, origin_name, dest, dest_name, carrier, carrier_name, tailnum, manufacturer, model, engine, type
dbl (22): dep_delay, arr_delay, air_time, distance, flight, engines, seats, aircraft_age, lat, lon, alt, wind_di...
dttm (5): dep_time, sched_dep_time, arr_time, sched_arr_time, time_hour
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
flights_dt %>%
group_by(origin) %>%
ggplot() +
aes(x = origin, fill = origin) +
geom_bar() +
scale_fill_manual(values = cbPalette) +
labs(
x = "airport",
y = "departure numbers",
title = "Departure numbers by airport"
) +
guides(fill = "none") +
theme_bw()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = month, y = mean_delay, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "mean delay (minutes)",
title = " Mean delay by month",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(month = floor_date(sched_dep_time, "month"), origin) %>%
summarise(no_of_delays = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_delays, colour = origin) +
geom_line(alpha = 0.6) +
geom_point(alpha = 0.8) +
labs(
x = "month",
y = "number of delays",
title = " Monthly trend of departure delays",
colour = "airport"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()
flights_dt %>%
mutate(month = month(sched_dep_time, label = TRUE)) %>%
group_by(month, origin) %>%
summarise(no_of_flights = n(), .groups = "drop") %>%
ggplot() +
aes(x = month, y = no_of_flights, fill = origin) +
geom_col(position = "dodge", alpha = 0.8) +
labs(
x = "month",
y = "number of departures",
title = "Departure numbers by month",
fill = "airport"
) +
scale_fill_manual(values = cbPalette) +
theme_bw()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean departure delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()
flights_dt %>%
filter(arr_delay > 0) %>%
group_by(carrier_name, origin) %>%
summarise(mean_delay = mean(arr_delay), .groups = "drop") %>%
ggplot() +
aes(x = reorder(carrier_name, mean_delay), y = mean_delay, fill = origin) +
geom_col(alpha = 0.8) +
facet_wrap(~ origin) +
labs(
x = "carrier",
y = "mean delay (minutes)",
title = "Mean arrival delay by carrier",
fill = "airport"
) +
theme_bw() +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
theme(axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 1)) +
coord_flip()
flights_dt %>%
filter(dep_delay > 0) %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = day, y = mean_delay, colour = origin) +
geom_point(alpha = 0.8) +
labs(
x = "date",
y = "mean delay (minutes)",
title = "Mean departure delay by month"
) +
scale_colour_manual(values = c("#999999", "#E69F00", "#56B4E9")) +
theme_bw()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, hour) %>%
group_by(hour) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
ggplot() +
aes(x = hour, y = mean_delay) +
geom_point(alpha = 0.8, colour = "#56B4E9") +
geom_smooth(se = FALSE, colour = "#E69F00") +
labs(
x = "hour",
y = "mean delay (minutes)",
title = "Mean departure delay by departure time"
) +
theme_bw() +
scale_x_continuous(limits = c(5, 23), breaks = seq(5, 23, by = 1))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
mutate(weekday = wday(sched_dep_time, label = TRUE)) %>%
group_by(weekday) %>%
summarise(mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = weekday, y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "weekday",
y = "mean departure delay (minutes)",
title = "Departure delay by weekday"
) +
theme_bw()
flights_dt %>%
filter(origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(desc(count)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, count), y = count) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "number of flights per year",
title = "Destinations from Newark Int."
) +
theme_bw() +
coord_flip()
flights_dt %>%
drop_na(dest_name) %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(dest, dest_name) %>%
summarise(mean_delay = mean(dep_delay, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(mean_delay)) %>%
head(10) %>%
ggplot() +
aes(x = reorder(dest_name, mean_delay), y = mean_delay) +
geom_col(fill = "#999999", alpha = 0.8) +
labs(
x = "destination",
y = "mean delay (minutes)",
title = "Destinations with longest delays"
) +
theme_bw() +
coord_flip()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_visibility = mean(visib, na.rm = TRUE),
mean_delay = mean(dep_delay), .groups = "drop") %>%
ggplot() +
aes(x = mean_visibility, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean visibility (miles)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by visibility"
) +
theme_bw()
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_speed = mean(wind_speed, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_speed, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind speed (mph)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind speed"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_wind_dir = mean(wind_dir, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_wind_dir, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean wind direction (degrees)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by wind direction"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_humidity = mean(humid, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_humidity, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean humidity (%)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by humidity"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_temp = mean(temp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_temp, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean temperature (degf)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by temperature"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_dewpoint = mean(dewp, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_dewpoint, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean dewpoint (degF)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by dewpoint"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_precip = mean(precip, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_precip, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean precipitation (inches)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by precipitation"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
group_by(day = floor_date(sched_dep_time, "day"), origin) %>%
summarise(mean_pressure = mean(pressure, na.rm = TRUE),
mean_delay = mean(dep_delay)) %>%
ggplot() +
aes(x = mean_pressure, y = mean_delay) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, colour = "#E69F00") +
labs(
x = "mean pressure (mbar)",
y = "mean departure delay (minutes)",
title = "Mean departure delay by pressure"
) +
theme_bw()
`summarise()` has grouped output by 'day'. You can override using the `.groups` argument.
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 1 rows containing non-finite values (stat_smooth).
Warning: Removed 1 rows containing missing values (geom_point).
# subset the flights dataset so as to drop NAs, filter on only departure delays (no early departures) and only for Newark Int.
flights_weather <- flights_dt %>%
na.omit() %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib)
# create correlation matrix
cor_matrix <- cor(flights_weather)
corrplot(cor_matrix, type = "upper", col = cbPalette,
tl.col = "black", method = "number")
ggpairs(flights_weather)
model_1_dt <- flights_dt %>%
na.omit() %>%
filter(dep_delay > 0 & origin == "EWR") %>%
select(dep_delay, temp, dewp, humid, wind_dir, wind_speed, precip, pressure, visib)
model_1 <- lm(dep_delay ~ ., data = model_1_dt)
# view results
summary(model_1)
Call:
lm(formula = dep_delay ~ ., data = model_1_dt)
Residuals:
Min 1Q Median 3Q Max
-59.90 -29.00 -16.77 10.88 815.29
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 631.659739 40.355182 15.653 < 2e-16 ***
temp 0.463713 0.135562 3.421 0.000625 ***
dewp -0.435947 0.145838 -2.989 0.002798 **
humid 0.467200 0.076075 6.141 8.26e-10 ***
wind_dir -0.017681 0.002832 -6.243 4.33e-10 ***
wind_speed 0.486040 0.054098 8.985 < 2e-16 ***
precip -8.181936 17.295980 -0.473 0.636177
pressure -0.623965 0.038058 -16.395 < 2e-16 ***
visib 0.297599 0.213728 1.392 0.163802
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 48.57 on 40553 degrees of freedom
Multiple R-squared: 0.025, Adjusted R-squared: 0.02481
F-statistic: 130 on 8 and 40553 DF, p-value: < 2.2e-16
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = humid, y = dewp) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = temp, y = dewp) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = visib, y = humid) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = dewp, y = pressure) +
geom_point()
flights_dt %>%
filter(dep_delay > 0 & origin == "EWR") %>%
na.omit() %>%
ggplot() +
aes(x = humid, y = wind_speed) +
geom_point()
summary(model_3)
Call:
lm(formula = dep_delay ~ air_time + aircraft_age + alt + hour +
dewp + pressure + dewp:humid + pressure:dewp + dewp:temp +
wind_speed:humid, data = model_2_dt)
Residuals:
Min 1Q Median 3Q Max
-86.30 -27.15 -13.92 9.77 815.55
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.705e+02 7.388e+01 -6.369 1.92e-10 ***
air_time -5.057e-02 2.425e-03 -20.853 < 2e-16 ***
aircraft_age 2.366e-01 4.810e-02 4.918 8.79e-07 ***
alt 1.059e-03 2.286e-04 4.633 3.62e-06 ***
hour 1.737e+00 5.498e-02 31.597 < 2e-16 ***
dewp 2.577e+01 1.827e+00 14.105 < 2e-16 ***
pressure 4.728e-01 7.222e-02 6.547 5.93e-11 ***
dewp:humid 7.640e-03 4.174e-04 18.303 < 2e-16 ***
dewp:pressure -2.607e-02 1.768e-03 -14.745 < 2e-16 ***
dewp:temp 3.842e-03 6.260e-04 6.138 8.43e-10 ***
humid:wind_speed 7.562e-03 8.812e-04 8.581 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 47.4 on 40551 degrees of freedom
Multiple R-squared: 0.07124, Adjusted R-squared: 0.07101
F-statistic: 311.1 on 10 and 40551 DF, p-value: < 2.2e-16
glmulti_fit <- glmulti(
dep_delay ~ .,
data = flights_weather,
level = 2, # 2 = include pairwise interactions, 1 = main effects only (main effect = no pairwise interactions)
minsize = 0, # no min size of model
maxsize = -1, # -1 = no max size of model
marginality = TRUE, # marginality here means the same as 'strongly hierarchical' interactions, i.e. include pairwise interactions only if both predictors present in the model as main effects.
method = "g", # the problem is too large for exhaustive search, so search using a genetic algorithm
crit = bic, # criteria for model selection is BIC value (lower is better)
plotty = FALSE, # don't plot models as function runs
report = TRUE, # do produce reports as function runs
confsetsize = 10, # return best 100 solutions
fitfunction = lm # fit using the `lm` function
)
flights_decision_tree <- flights_dt %>%
filter(origin == "EWR") %>%
select(sched_dep_time, dep_delay, distance, engines, engine,
aircraft_age, lat, lon, alt, hour, temp, precip, humid, visib, pressure,
dewp, wind_dir, wind_speed) %>%
mutate(engines = factor(engines),
engine = factor(engine),
month = month(sched_dep_time, label = TRUE),
late_departure = factor(ifelse(dep_delay > 0, "Late", "Ontime/Early"))) %>%
mutate(season = factor(case_when(
month %in% c("Dec", "Jan", "Feb") ~ "Winter",
month %in% c("Mar", "Apr", "May") ~ "Spring",
month %in% c("Jun", "Jul", "Aug") ~ "Summer",
month %in% c("Sep", "Oct", "Nov") ~ "Autumn"
))) %>%
select(-c(dep_delay, sched_dep_time)) %>%
na.omit()
str(flights_decision_tree)
tibble [91,968 × 19] (S3: tbl_df/tbl/data.frame)
$ distance : num [1:91968] 1400 719 1065 2227 1023 ...
$ engines : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
$ engine : Factor w/ 6 levels "4 Cycle","Reciprocating",..: 3 3 3 3 3 3 3 3 4 3 ...
$ aircraft_age : num [1:91968] 22 9 21 13 15 31 12 20 22 19 ...
$ lat : num [1:91968] 30 42 26.1 36.1 26.7 ...
$ lon : num [1:91968] -95.3 -87.9 -80.2 -115.2 -80.1 ...
$ alt : num [1:91968] 97 668 9 2141 19 ...
$ hour : num [1:91968] 5 5 6 6 6 6 6 6 6 6 ...
$ temp : num [1:91968] 39 39 37.9 37.9 37.9 ...
$ precip : num [1:91968] 0 0 0 0 0 0 0 0 0 0 ...
$ humid : num [1:91968] 64.4 64.4 67.2 67.2 67.2 ...
$ visib : num [1:91968] 10 10 10 10 10 10 10 10 10 10 ...
$ pressure : num [1:91968] 1012 1012 1012 1012 1012 ...
$ dewp : num [1:91968] 28 28 28 28 28 ...
$ wind_dir : num [1:91968] 260 260 240 240 240 240 240 240 240 240 ...
$ wind_speed : num [1:91968] 12.7 12.7 11.5 11.5 11.5 ...
$ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 1 1 1 1 1 1 1 1 1 1 ...
$ late_departure: Factor w/ 2 levels "Late","Ontime/Early": 1 2 2 2 1 2 2 2 2 2 ...
$ season : Factor w/ 4 levels "Autumn","Spring",..: 4 4 4 4 4 4 4 4 4 4 ...
- attr(*, "na.action")= 'omit' Named int [1:25159] 4 9 14 22 26 31 49 58 59 61 ...
..- attr(*, "names")= chr [1:25159] "4" "9" "14" "22" ...
# set.seed(19)
n_data <- nrow(flights_decision_tree)
# create test sample index
test_index <- sample(1:n_data, size = n_data * 0.2)
# create test set
flights_test <- slice(flights_decision_tree, test_index)
# create training set
flights_train <- slice(flights_decision_tree, -test_index)
# check split of the data
flights_test %>%
janitor::tabyl(late_departure)
late_departure n percent
Late 8117 0.4413092
Ontime/Early 10276 0.5586908
flights_train %>%
janitor::tabyl(late_departure)
late_departure n percent
Late 32445 0.4409786
Ontime/Early 41130 0.5590214
flights_fit <- rpart(
formula = late_departure ~ .,
data = flights_train,
method = 'class'
)
rpart.plot(flights_fit,
yesno = 2,
fallen.leaves = TRUE,
faclen = 2,
digits = 4)
conf_mat <- flights_test_pred %>%
conf_mat(truth = late_departure, estimate = pred)
conf_mat
Truth
Prediction Late Ontime/Early
Late 4484 3136
Ontime/Early 3633 7140
confusionMatrix(flights_test_pred$pred, flights_test_pred$late_departure)
Confusion Matrix and Statistics
Reference
Prediction Late Ontime/Early
Late 4484 3136
Ontime/Early 3633 7140
Accuracy : 0.632
95% CI : (0.625, 0.639)
No Information Rate : 0.5587
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.2488
Mcnemar's Test P-Value : 1.653e-09
Sensitivity : 0.5524
Specificity : 0.6948
Pos Pred Value : 0.5885
Neg Pred Value : 0.6628
Prevalence : 0.4413
Detection Rate : 0.2438
Detection Prevalence : 0.4143
Balanced Accuracy : 0.6236
'Positive' Class : Late
rf_classifier <- ranger(late_departure ~ .,
data = flights_train,
importance = "impurity",
num.trees = 1000,
mtry = 2,
min.node.size = 5)
rf_classifier
Ranger result
Call:
ranger(late_departure ~ ., data = flights_train, importance = "impurity", num.trees = 1000, mtry = 2, min.node.size = 5)
Type: Classification
Number of trees: 1000
Sample size: 73575
Number of independent variables: 18
Mtry: 2
Target node size: 5
Variable importance mode: impurity
Splitrule: gini
OOB prediction error: 32.36 %
importance(rf_classifier)
distance engines engine aircraft_age lat lon alt hour temp
1336.15416 13.07388 309.95871 1803.02661 1302.03732 1319.34020 1322.86372 2841.56151 1747.15827
precip humid visib pressure dewp wind_dir wind_speed month season
125.14775 1967.98670 306.21730 2103.86915 1727.47836 1557.39937 1347.14084 987.72526 579.43656
flights_test_pred <- flights_test %>%
mutate(pred = predict(rf_classifier, data = flights_test)$predictions)
confusionMatrix(flights_test_pred$pred, flights_test_pred$late_departure)
Confusion Matrix and Statistics
Reference
Prediction Late Ontime/Early
Late 4360 2204
Ontime/Early 3724 8105
Accuracy : 0.6777
95% CI : (0.6709, 0.6845)
No Information Rate : 0.5605
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.3323
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.5393
Specificity : 0.7862
Pos Pred Value : 0.6642
Neg Pred Value : 0.6852
Prevalence : 0.4395
Detection Rate : 0.2370
Detection Prevalence : 0.3569
Balanced Accuracy : 0.6628
'Positive' Class : Late
importance(flights_test_pred)
Error in UseMethod("importance") :
no applicable method for 'importance' applied to an object of class "c('tbl_df', 'tbl', 'data.frame')"